#!/usr/bin/env Rscript

#install R libs on server
#install.packages("dplyr", lib="~/mylibs/R")
#.libPaths("~/mylibs/R")
require(data.table)
library(ggplot2)
library(dplyr)
library(plyr)

#############################
############################# Monte Carlo Resampling/Bootstrapping
#############################

#change to full local path to the replication data
wd1 <- "C:\\Users\\YFan\\OneDrive - NORCE\\NFood_replication\\"

crops = c("Corn", "Wheat", "Soybean", "Cotton", "Rice", "Sugarcane", "Tropical Corn", "Tropical Soybean")
scenario = c("RCP45 - RCP85", "SAI - RCP85", "MSB - RCP85", "CCT - RCP85")


#==========alternative MLR to test interaction effects, and use log change (better than % change) in RADD and RADI as predictors for radiative effect
dif.8crop.s <- readRDS(file = paste0(wd1,"/data/RData/climate-yield-changedata-MLR-selected.Rds"))
train.data <- dif.8crop.s[Scenario=="SAI - RCP85" & Irrig==FALSE & Crop=="Sugarcane" & area.wt>20000] 
train.data <- dif.8crop.s[Scenario=="SAI - RCP85" & Irrig==FALSE & Sample==18098]
#ggplot(train.data, aes(tsa.dif, yield.dif) ) + #slight nonlinear
#ggplot(train.data, aes(rh.dif, yield.dif) ) +  #slight nonlinear (rh.dif (%) has better distribution than rh.log)
ggplot(train.data, aes(ppt.log, yield.dif) ) + #almost flat with ppt.dif, but near linear with ppt.log and ppt.difn
  #ggplot(train.data, aes(radd.dif, yield.dif) ) + #almost linear
  #ggplot(train.data, aes(radi.dif, yield.dif) ) + #almost linear
  geom_point() +
  stat_smooth()
rm(train.data)

(crop <- unique(dif.8crop.s$Crop))
(scenario <- unique(dif.8crop.s$Scenario))
models.irrig3 <- dif.8crop.s[Irrig==TRUE]  %>% 
  mutate(Crop = factor(Crop, levels=crop)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  dlply(c("Scenario", "Crop", "Sample"), function(df)lm(yield.dif ~ tsa.dif+radd.log+radi.log, data = df, na.action = na.omit))
models.rainfed3 <- dif.8crop.s[Irrig==FALSE ]  %>% 
  mutate(Crop = factor(Crop, levels=crop)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  dlply(c("Scenario", "Crop", "Sample"), function(df)lm(yield.dif ~ tsa.dif+radd.log+radi.log+rh.dif+ppt.log, data = df, na.action = na.omit))
models.rainfed33 <- dif.8crop.s[Irrig==FALSE ]  %>% 
  mutate(Crop = factor(Crop, levels=crop)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  dlply(c("Scenario", "Crop", "Sample"), function(df)lm(yield.dif ~ tsa.dif*rh.dif+radd.log+radi.log+ppt.log, data = df, na.action = na.omit))
models.rainfed333 <- dif.8crop.s[Irrig==FALSE ]  %>% 
  mutate(Crop = factor(Crop, levels=crop)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  dlply(c("Scenario", "Crop", "Sample"), 
        function(df)lm(yield.dif ~ tsa.dif+rh.dif+radd.log+radi.log+ppt.log+tsa.dif:rh.dif+ppt.log:rh.dif, data = df, na.action = na.omit))

coeff.i.samples3 <- as.data.table(ldply(models.irrig3, coef)) 
coeff.r.samples3 <- as.data.table(ldply(models.rainfed3, coef))
coeff.r.samples33 <- as.data.table(ldply(models.rainfed33, coef))
coeff.r.samples333 <- as.data.table(ldply(models.rainfed333, coef))
names(coeff.i.samples3)[-(1:3)] <- c("Intercept", "T", "RD", "RI")
names(coeff.r.samples3)[-(1:3)] <- c("Intercept", "T", "RD", "RI", "RH", "P") #RH,P for moisture/water availability
names(coeff.r.samples33)[-(1:3)] <- c("Intercept", "T", "RH", "RD", "RI", "P", "TRH") #TRH for the interaction term of T*RH
names(coeff.r.samples333)[-(1:3)] <- c("Intercept", "T", "RH", "RD", "RI", "P", "TRH", "RHP") #RHP for the interaction term of RH*P

coeff.i.samples3.m <- coeff.i.samples3[,.(Intercept=mean(Intercept), T=mean(T), RD=mean(RD), RI=mean(RI)), by=.(Scenario,Crop)]
coeff.r.samples3.m <- coeff.r.samples3[,.(Intercept=mean(Intercept), T=mean(T), RD=mean(RD), RI=mean(RI), RH=mean(RH), P=mean(P) ), by=.(Scenario,Crop)]
coeff.r.samples33.m <-coeff.r.samples33[,.(Intercept=mean(Intercept), T=mean(T), RD=mean(RD), RI=mean(RI), RH=mean(RH), P=mean(P), TRH=mean(TRH) ), by=.(Scenario,Crop)]

coeff.i.samples3.m[Scenario=="SAI - RCP85", RI/RD, by=.(Scenario,Crop)] #only tropical corn and soybean have RI/RD >1
coeff.r.samples3.m[Scenario=="SAI - RCP85", RI/RD, by=.(Scenario,Crop)] #rainfed crops all have 0< RI/RD <1
coeff.i.samples3.m[Scenario=="SAI - RCP85", mean(RI/RD), by=.(Scenario)] #average 0.82 for irrigated
coeff.r.samples3.m[Scenario=="SAI - RCP85", mean(RI/RD), by=.(Scenario)] #average 0.52 for rainfed
#when log changes in RADD/RADI are used as predictor, most crops have diffuse:direct effect ratio <1, meaning that
#if the direct and diffuse component decrease (e.g., -20%) and increase (e.g., +20%) proportionally, like what happened after Mount Pinatubo volcanic eruption,
#the negative effect of reduced direct radiation will overtake the positive diffuse radiation effect, resulting in overall negative insolation effect in SAI 
#our total insolation effect is near zero because actual changes in direct (-13%) and diffuse (+23%) lights in our experiments are non-symmetrical (Fig. 1c), resulting in slight positive effects of insolation change for most crops 

library(broom) #get p-value and other model statistics (AIC, BIC)
# tidy(lm)     #get coefficient table as a data frame
# glance(lm)   #get rest of stats as a data frame
# glance(lm)$p.value # p.value of the whole model (glm does not have p.value for the model)
coef.table.i3 <- as.data.table(ldply(models.irrig3, tidy))     #coefficient table of individual regressions
coef.table.r3 <- as.data.table(ldply(models.rainfed3, tidy))   #including error terms, p-values for each parameter.
stats.i3 <- as.data.table(ldply(models.irrig3, glance))    #all other model statistics r.squared, AIC, BIC etc.
stats.r3 <- as.data.table(ldply(models.rainfed3, glance))  #and p.value of each MLR model
stats.r33 <- as.data.table(ldply(models.rainfed33, glance)) #and interactions
stats.r333 <- as.data.table(ldply(models.rainfed333, glance)) #and interactions
stats.i3[, .(AIC.m=mean(AIC)),by=.(Scenario,Crop)]
stats.r3[, .(AIC.m=mean(AIC)),by=.(Scenario)]    #simple lm has larger AIC, glm gives the same coefficients as lm, but slightly different AIC
stats.r33[, .(AIC.m=mean(AIC)),by=.(Scenario)]   #with interaction term T*RH, smaller AIC than simple lm and than including both T*RH and P*RH
stats.r333[, .(AIC.m=mean(AIC)),by=.(Scenario)]  #higher AIC than above
length(unique(stats.i3[p.value<0.05]$Sample))/length(unique(stats.i3$Sample))*100 #92.4% majority of grid samples have at least one parameter with p.value < 0.1
length(unique(stats.r3[p.value<0.05]$Sample))/length(unique(stats.r3$Sample))*100 #98%
stats.i3[, .(r.squared.m=mean(r.squared)),by=.(Scenario,Crop)]
stats.r3[, .(r.squared.m=mean(r.squared)),by=.(Scenario,Crop)]
# badcells <- unique(stats.r3[is.na(r.squared)]$Sample) #939 grid cells have non-valid R2 and AIC for rainfed crops
# badcells <- unique(stats.r3[is.infinite(AIC)]$Sample)
# length(badcells)
# summary(models.rainfed3$`RCP45 - RCP85.Corn.18088`)
# dif.8crop.s[Scenario=="SAI - RCP85" & Crop=="Corn" & Irrig==FALSE & Sample==18088]$yield.dif #bad cells have response variable yield.dif=0!
##See Data2.Input-for-MLR-analysis.R: remove grid cells that have yield.dif=0 across all years (which cause invalid MLR) 

save(list = c("stats.i3", "stats.r3", "stats.r33", "stats.r333", "coeff.i.samples3", "coeff.r.samples3", "coeff.r.samples33", "coeff.r.samples333"), 
     file = paste0(wd1,"/data/RData/Fig.ED4.MLR-pergrid-rad-ppt.log.Rdata")) #save MLR coefficients 
rm(models.rainfed3, models.irrig3,models.rainfed33, models.rainfed333)




#==============get mean partial and total effects and their confidence intervals by resampling===================
#read regression coefficients
load(paste0(wd1,"/data/RData/Fig.ED4.MLR-pergrid-rad-ppt.log.Rdata")) #coefficients when log change of RADD and RADI are predictors
coeff.i.sai <- coeff.i.samples3[Scenario=="SAI - RCP85"]
coeff.r.sai <- coeff.r.samples333[Scenario=="SAI - RCP85"]

#interannual variability under RCP85 (absolute change): -1sd T ~ -1.5 [°C], +1sd RH ~ +3.6 unit RH [%], -1sd P ~ -22 P [mm/month], -1sd RD ~ -5.5 [W/m^2], +1sd RI ~ 1.4 [W/m^2]
#interannual variability of SAI-RCP85: -1sd T ~ -1.06 [°C], +1sd RH ~ 4.4 [%] or 0.08 [log], -1sd P ~ -0.31 P [log], -1sd RD ~ -0.096 [log], +1sd RI ~ 0.08 [log]
dif.8crop.sai <- copy(dif.8crop.s[Scenario=="SAI - RCP85"])
dif.8crop.sai <- dif.8crop.sai[, tsa.sd:=sd(tsa.dif), by=.(Crop,Irrig,Sample)]  #absolute terms for T and RH
dif.8crop.sai <- dif.8crop.sai[, rh.sd:=sd(rh.dif), by=.(Crop,Irrig,Sample)]
dif.8crop.sai <- dif.8crop.sai[, radd.sd:=sd(radd.log), by=.(Crop,Irrig,Sample)] #log terms for RD, RI, P
dif.8crop.sai <- dif.8crop.sai[, radi.sd:=sd(radi.log), by=.(Crop,Irrig,Sample)] #because they have log linear relation with yield.dif (log)
dif.8crop.sai <- dif.8crop.sai[, ppt.sd:=sd(ppt.log), by=.(Crop,Irrig,Sample)]

#use absolute changes in T and RH to estimate the effects of "-1 [°C] T", "+5 [%] RH", but log changes in RD, RI and P (they have log linear relationship with yield.dif)
#also show +- sd in all variables to estimate the effects of "-1sd T", "-1sd RD", "+1sd RI", "-1sd P", "+1sd RH"

#unit changes should be named c("-1 [°C] T", "-10% RD", "+10% RI", "-10% P", "+5 [%] RH"), and c("-1sd T", "-1sd RD", "+1sd RI", "-1sd P", "+1sd RH") respectively
dif.unit <- dif.8crop.sai[,.(tsa.unit1=-1, rh.unit1=5, radd.unit1=log(1-0.1), radi.unit1=log(1+0.1), ppt.unit1=log(1-0.1), 
                             #tsa.unit1=tsa.dif*scaler$tsa, ppt.unit1=ppt.log*scaler$ppt, rh.unit1=rh.log*scaler$rh,radd.unit1=radd.log*scaler$radd, radi.unit1=radi.log*scaler$radi,
                             tsa.unit2=-tsa.sd, ppt.unit2=-ppt.sd, rh.unit2=rh.sd, radd.unit2=-radd.sd, radi.unit2=radi.sd, #alternative: 1sd of interannual variability of log difference of SAI-RCP85 as unit changes 
                             yield0, area.wt,Year,Crop,Irrig,Sample)]
sens.i.sai <- coeff.i.sai[dif.unit[Irrig==TRUE],                                    
                          .(Intercept, T.log1=T*i.tsa.unit1, RD.log1=RD*i.radd.unit1, RI.log1=RI*i.radi.unit1, RH.log1=0, P.log1=0, TRH.log1=0, RHP.log1=0,
                            T.log2=T*i.tsa.unit2, RD.log2=RD*i.radd.unit2, RI.log2=RI*i.radi.unit2, RH.log2=0, P.log2=0, TRH.log2=0, RHP.log2=0,
                            yield0=i.yield0, area=i.area.wt, Year=i.Year), by=.EACHI, on=.(Crop,Sample)]
sens.r.sai <- coeff.r.sai[dif.unit[Irrig==FALSE], 
                          .(Intercept, T.log1=T*i.tsa.unit1, RD.log1=RD*i.radd.unit1, RI.log1=RI*i.radi.unit1, P.log1=P*i.ppt.unit1, RH.log1=RH*i.rh.unit1, 
                            T.log2=T*i.tsa.unit2, RD.log2=RD*i.radd.unit2, RI.log2=RI*i.radi.unit2, P.log2=P*i.ppt.unit2, RH.log2=RH*i.rh.unit2, 
                            TRH.log1=TRH*i.rh.unit1*i.tsa.unit1, TRH.log2=TRH*i.rh.unit2*i.tsa.unit2, 
                            #RDRI.log1=RDRI*i.radd.unit1*i.radi.unit1, RDRI.log2=RDRI*i.radd.unit2*i.radi.unit2,
                            RHP.log1=RHP*i.rh.unit1*i.ppt.unit1, RHP.log2=RHP*i.rh.unit2*i.ppt.unit2,
                            yield0=i.yield0,area=i.area.wt, Year=i.Year), by=.EACHI, on=.(Crop,Sample)]
sens.i.sai  <- sens.i.sai[,Tot.log1:=Intercept+T.log1+RD.log1+RI.log1] 
sens.r.sai  <- sens.r.sai[,Tot.log1:=Intercept+T.log1+RD.log1+RI.log1+P.log1+RH.log1+TRH.log1+RHP.log1]
sens.i.sai  <- sens.i.sai[,Tot.log2:=Intercept+T.log2+RD.log2+RI.log2] 
sens.r.sai  <- sens.r.sai[,Tot.log2:=Intercept+T.log2+RD.log2+RI.log2+P.log2+RH.log2+TRH.log2+RHP.log2]

sens.i.sai$Irrig <- TRUE
sens.r.sai$Irrig <- FALSE
sens.sai <- rbind(sens.i.sai, sens.r.sai, fill=TRUE)

sens.sai.wm <- sens.sai[, .(T.log1=weighted.mean(T.log1, area, na.rm = T), RD.log1=weighted.mean(RD.log1, area, na.rm = T),RI.log1=weighted.mean(RI.log1, area, na.rm = T),
                            RH.log1=weighted.mean(RH.log1, area, na.rm = T), P.log1=weighted.mean(P.log1, area, na.rm = T), Tot.log1=weighted.mean(Tot.log1, area, na.rm = T), 
                            T.log2=weighted.mean(T.log2, area, na.rm = T), RD.log2=weighted.mean(RD.log2, area, na.rm = T), RI.log2=weighted.mean(RI.log2, area, na.rm = T),
                            RH.log2=weighted.mean(RH.log2, area, na.rm = T), P.log2=weighted.mean(P.log2, area, na.rm = T), Tot.log2=weighted.mean(Tot.log2, area, na.rm = T),
                            R.log1=weighted.mean(RD.log1+RI.log1, area, na.rm = T), R.log2=weighted.mean(RD.log2+RI.log2, area, na.rm = T), 
                            TRH.log1=weighted.mean(TRH.log1, area, na.rm = T), TRH.log2=weighted.mean(TRH.log2, area, na.rm = T), 
                            RHP.log1=weighted.mean(RHP.log1, area, na.rm = T), RHP.log2=weighted.mean(RHP.log2, area, na.rm = T), 
                            yield0=weighted.mean(yield0, area, na.rm = T), area=sum(area, na.rm=T)),
                        by =.(Crop,Irrig,Year)]
sens.sai.wm$Level <- "Mean"



#=======bootstrap resampling to get probability distribution of partial and total climate effects
#library(boot)
library(dplyr)
library(rsample)
library(broom)
library(purrr) #use purrr::map to apply a function to all the bootstrap samples at once


#=define helper function to obtain crop weighted average climate effect per year using per-grid regression
bs.wtm <- function(split) {
  #=======resample irrig/rainfed crops together
  sample.s <- analysis(split)  #boots sample from one scenario within each crop type based on sample.data

  #==sample predicted effects using the sample ID from above
  sens.log.s <- data[sample.s[,c("Scenario","Crop","Sample")], on=.(Crop,Sample)] #use the same sample indices of each crop for all scenarios
  
  #==weighted by area gives best estimate of global effect
  sens.wm <- sens.log.s[, .(T.log1=weighted.mean(T.log1, area, na.rm = T), RD.log1=weighted.mean(RD.log1, area, na.rm = T),RI.log1=weighted.mean(RI.log1, area, na.rm = T),
                            RH.log1=weighted.mean(RH.log1, area, na.rm = T), P.log1=weighted.mean(P.log1, area, na.rm = T), Tot.log1=weighted.mean(Tot.log1, area, na.rm = T), 
                            T.log2=weighted.mean(T.log2, area, na.rm = T), RD.log2=weighted.mean(RD.log2, area, na.rm = T), RI.log2=weighted.mean(RI.log2, area, na.rm = T),
                            RH.log2=weighted.mean(RH.log2, area, na.rm = T), P.log2=weighted.mean(P.log2, area, na.rm = T), Tot.log2=weighted.mean(Tot.log2, area, na.rm = T), 
                            TRH.log1=weighted.mean(TRH.log1, area, na.rm = T), TRH.log2=weighted.mean(TRH.log2, area, na.rm = T), 
                            RHP.log1=weighted.mean(RHP.log1, area, na.rm = T), RHP.log2=weighted.mean(RHP.log2, area, na.rm = T), 
                            yield0=weighted.mean(yield0, area, na.rm = T), area=sum(area, na.rm=T)),
                          by =.(Crop,Year)]
  return(sens.wm)
}


set.seed(27)
sample.data = coeff.i.sai
boots.i <- bootstraps(sample.data, times = 1000, strata=Crop) #stratified sampling per crop to obtain sample IDs

sample.data = coeff.r.sai
boots.r <- bootstraps(sample.data, times = 1000, strata=Crop) #stratified sampling per crop type

#===bootstrapping process (takes ~100mins for 1000 times)
data <- sens.i.sai #external input (log) to function bs.wtm; this is the real data to bootstrap
boot_stats_i <- boots.i %>%
  mutate(stats = map(splits, bs.wtm)) #splits is just for getting sample ID of one scenario based on sample.data
  #stats_tidy = map(stats, tidy))
data <- sens.r.sai #external input (log) to function bs.wtm
boot_stats_r <- boots.r %>%
  mutate(stats = map(splits, bs.wtm)) #eff.r.samples is the real data to bs.wtm

#save(list=c("boot_stats_i","boot_stats_r"), file = paste0(wd1,"/Scripts/revision/MLR-pergrid-sensitivity-SAI-boot_stats_v3.Rdata"))

rm(sens.i.sai, sens.r.sai)



#===confidence interval for irrigated crops
alpha <- .05 #95% confidence interval
eff.i.low <- boot_stats_i %>%
  unnest(stats) %>%
  dplyr::group_by( Crop, Year) %>%
  dplyr::summarise_at(c("T.log1","RD.log1","RI.log1","P.log1","RH.log1","TRH.log1","RHP.log1","Tot.log1",
                        "T.log2","RD.log2","RI.log2","P.log2","RH.log2","TRH.log2","RHP.log2","Tot.log2"), 
                      quantile, probs = alpha/2, na.rm = TRUE)
eff.i.high <- boot_stats_i %>%
  unnest(stats) %>%
  dplyr::group_by( Crop, Year) %>%
  dplyr::summarise_at(c("T.log1","RD.log1","RI.log1","P.log1","RH.log1","TRH.log1","RHP.log1","Tot.log1",
                        "T.log2","RD.log2","RI.log2","P.log2","RH.log2","TRH.log2","RHP.log2","Tot.log2"),  
                      quantile, probs = 1 - alpha/2, na.rm = TRUE)

eff.i.low$Level="Conf.L"
eff.i.high$Level="Conf.U"
eff.i.ci <- as.data.table(rbind(eff.i.low, eff.i.high)) #%>%
#  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
#  setorder(Scenario)

#===rainfed crops
alpha <- .05 #95% confidence interval (2.5th to 97.5th percentiles)
eff.r.low <- boot_stats_r %>%
  unnest(stats) %>%
  dplyr::group_by( Crop, Year) %>%
  dplyr::summarise_at(c("T.log1","RD.log1","RI.log1","P.log1","RH.log1","TRH.log1","RHP.log1","Tot.log1",
                        "T.log2","RD.log2","RI.log2","P.log2","RH.log2","TRH.log2","RHP.log2","Tot.log2"), 
                      quantile, probs = alpha/2, na.rm = TRUE)
eff.r.high <-boot_stats_r %>%
  unnest(stats) %>%
  dplyr::group_by( Crop, Year) %>%
  dplyr::summarise_at(c("T.log1","RD.log1","RI.log1","P.log1","RH.log1","TRH.log1","RHP.log1","Tot.log1",
                        "T.log2","RD.log2","RI.log2","P.log2","RH.log2","TRH.log2","RHP.log2","Tot.log2"), 
                      quantile, probs = 1 - alpha/2, na.rm = TRUE)

eff.r.low$Level="Conf.L"
eff.r.high$Level="Conf.U"
eff.r.ci <- as.data.table(rbind(eff.r.low, eff.r.high)) #%>%
#  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
#  setorder(Scenario)


#====final step: combine mean and confidence interval
eff.i.ci$Irrig=TRUE
eff.r.ci$Irrig=FALSE
sens.ci.boots <- rbind(eff.i.ci, eff.r.ci)
sens.ci.boots <- as.data.table(sens.ci.boots)

sens.sai.wm$Level <- "Mean"
sens.log.boots <- rbind(sens.sai.wm, sens.ci.boots, fill=TRUE)
#add mean base yield and area (from dif.8crop.s) to levels Conf.L and Conf.U
sens.log.boots <- sens.log.boots[sens.sai.wm, yield0:=i.yield0, on=.(Crop,Irrig,Year)]
sens.log.boots <- sens.log.boots[sens.sai.wm, area:=i.area, on=.(Crop,Irrig,Year)]


save(list=c("sens.sai.wm", "sens.ci.boots", "sens.log.boots"), file = paste0(wd1,"/data/RData/Fig.ED4.MLR-pergrid-sensitivity-SAI-Bootstrap.Rdata"))


#========end of Monte Carlo resampling/bootstrapping============

